This notebook walks through the creation of multitask models on MUV (Maximum Unbiased Validation), a benchmark dataset designed to be challenging for virtual screening. The goal is to demonstrate that multitask methods outperform singletask methods on MUV.
In [1]:
%reload_ext autoreload
%autoreload 2
%pdb off
# When True, the featurization and splitting steps below reuse cached results
# on disk instead of recomputing them (passed as `reload=` downstream).
# NOTE(review): this shadows the Python 2 builtin `reload`; harmless here, but
# renaming would require touching every later cell that reads it.
reload = True
In [2]:
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
# Gzipped CSV of the raw MUV dataset, path relative to this notebook.
dataset_file= "../datasets/muv.csv.gz"
# Loads the CSV into a dataframe-like object (columns include "smiles" plus
# the per-assay label columns used as tasks below).
dataset = load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))
Now, let's visualize some compounds from our dataset.
In [3]:
from itertools import islice
from rdkit import Chem
from deepchem.utils.visualization import mols_to_pngs, display_images

# Render the first few compounds in the dataset as 2-D structure images.
num_to_display = 12
molecules = [Chem.MolFromSmiles(row["smiles"])
             for _, row in islice(dataset.iterrows(), num_to_display)]
display_images(mols_to_pngs(molecules))
In [4]:
from deepchem.featurizers.fingerprints import CircularFingerprint
# Featurize each compound as a 1024-bit circular (ECFP-style) fingerprint.
featurizers = [CircularFingerprint(size=1024)]
In [5]:
# The 17 MUV bioassay label columns; each becomes one binary classification
# task in the multitask network below.
MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
'MUV-466', 'MUV-832']
In [6]:
import os
from deepchem.featurizers.featurize import DataFeaturizer
# The base_dir holds the results of all analysis
# NOTE(review): hardcoded absolute scratch path — adjust for your own machine.
base_dir = "/scratch/users/rbharath/muv_multitask_analysis"
#Make directories to store the raw and featurized datasets.
feature_dir = os.path.join(base_dir, "features")
samples_dir = os.path.join(base_dir, "samples")
# Featurize each compound's SMILES string with the circular-fingerprint
# featurizers defined above, labeling examples with the 17 MUV tasks.
featurizer = DataFeaturizer(tasks=MUV_tasks,
smiles_field="smiles",
compound_featurizers=featurizers,
verbosity="low")
# Setting reload=True directs the featurizer to use existing featurization on disk if such exists.
featurized_samples = featurizer.featurize(dataset_file, feature_dir, samples_dir, shard_size=4096,
reload=reload)
In [7]:
# Scaffold-based split: groups structurally related compounds into the same
# fold, giving a harder (and more realistic) generalization test than a
# random split.
splittype = "scaffold"
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
train_samples, valid_samples, test_samples = featurized_samples.train_valid_test_split(
splittype, train_dir, valid_dir, test_dir, log_every_n=1000, reload=reload)
In [8]:
from deepchem.datasets import Dataset

# Silence per-shard logging during dataset construction.
verbosity = None

def _build_dataset(data_dir, samples, name):
    """Construct (or reload from disk, if `reload` is True) one featurized
    Dataset split in `data_dir`, carrying the 17 MUV tasks."""
    print("Creating %s dataset" % name)
    return Dataset(data_dir=data_dir, samples=samples,
                   featurizers=featurizers, tasks=MUV_tasks,
                   verbosity=verbosity, reload=reload)

# The three splits differ only in directory and sample set, so build them
# with one helper instead of three copy-pasted constructor calls.
train_dataset = _build_dataset(train_dir, train_samples, "train")
valid_dataset = _build_dataset(valid_dir, valid_samples, "valid")
test_dataset = _build_dataset(test_dir, test_samples, "test")
In [9]:
# No input/output transformations (e.g. normalization) are applied: binary
# fingerprint features and classification labels are used as-is.
input_transformers = []
output_transformers = []
In [10]:
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem import metrics
from deepchem.metrics import Metric
import numpy as np
import numpy.random
model_dir = os.path.join(base_dir, "model")
# Every MUV task is a binary classification problem.
MUV_task_types = {task: "Classification" for task in MUV_tasks}
# Hyperparameter grid for HyperparamOpt. Each value is a one-element list, so
# the grid has a single point — effectively fixed hyperparameters; add more
# values per key to search over them.
params_dict = {"activation": ["relu"],
"momentum": [.9],
"batch_size": [50],
"init": ["glorot_uniform"],
"data_shape": [train_dataset.get_data_shape()],
"learning_rate": [1e-3],
"decay": [1e-6],
"nb_hidden": [1000],
"nb_epoch": [1],
"nesterov": [False],
"dropouts": [(.5,)],
"nb_layers": [1],
"batchnorm": [False],
"layer_sizes": [(1000,)],
"weight_init_stddevs": [(.1,)],
"bias_init_consts": [(1.,)],
"num_classes": [2],
"penalty": [0.],
"optimizer": ["sgd"],
"num_classification_tasks": [len(MUV_task_types)]
}
def model_builder(task_types, params_dict, logdir, verbosity=None):
    """Model factory handed to HyperparamOpt.

    Wraps the fully-connected multitask classifier in the generic
    TensorflowModel interface, writing checkpoints/logs under `logdir`.
    """
    model = TensorflowModel(task_types, params_dict, logdir,
                            tf_class=TensorflowMultiTaskClassifier,
                            verbosity=verbosity)
    return model
# Model quality is measured as the mean ROC-AUC across all tasks.
metric = Metric(metrics.roc_auc_score, np.mean)
optimizer = HyperparamOpt(model_builder, MUV_task_types, verbosity="low")
# Since every hyperparameter list above holds a single value, this "search"
# trains one model and scores it on the validation set.
best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
params_dict, train_dataset, valid_dataset, output_transformers, metric, logdir=model_dir)
In [ ]: